Take-home Exercise 1

Yu Yiling https://www.linkedin.com/in/yiling-yu/ (School of Computing and Information System)https://www.smu.edu.sg
2021-09-09

Take-home Exercise 1 requirements:

Things to clarify:

1. Install packages

packages = c('sf', 'tidyverse','readxl','maptools', 'raster','spatstat', 'tmap')
for (p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p,character.only = T)
}

2. Aspatial data preparation

Import data, clean data and create dataframes

data_name_list <- list(
  "Standar Kelurahan Data Corona (31 Maret 2020 Pukul 08.00)", 
  "Standar Kelurahan Data Corona (30 April 2020 Pukul 09.00)", 
  "Standar Kelurahan Data Corona (31 MEI 2020 Pukul 09.00)",
  "Standar Kelurahan Data Corona (30 Juni 2020 Pukul 09.00)",
  "Standar Kelurahan Data Corona (31 Juli 2020 Pukul 09.00)",
  "Standar Kelurahan Data Corona (31 Agustus 2020 Pukul 10.00)",
  "Standar Kelurahan Data Corona (30 September 2020 Pukul 10.00)",
  "Standar Kelurahan Data Corona (31 Oktober 2020 Pukul 10.00)",
  "Standar Kelurahan Data Corona (30 November 2020 Pukul 10.00)",
  "Standar Kelurahan Data Corona (31 Desember 2020 Pukul 10.00)",
  "Standar Kelurahan Data Corona (30 Januari 2021 Pukul 10.00)",
  "Standar Kelurahan Data Corona (28 Februari 2021 Pukul 10.00)",
  "Standar Kelurahan Data Corona (31 Maret 2021 Pukul 10.00)",
  "Standar Kelurahan Data Corona (30 April 2021 Pukul 10.00)",
  "Standar Kelurahan Data Corona (31 Mei 2021 Pukul 10.00)",
  "Standar Kelurahan Data Corona (30 Juni 2021 Pukul 10.00)",
  "Standar Kelurahan Data Corona (31 Juli 2021 Pukul 10.00)")

data_refer_list <- list("03_2020", "04_2020", "05_2020", "06_2020", "07_2020", "08_2020", "09_2020", "10_2020", "11_2020", "12_2020", "01_2021", "02_2021", "03_2021", "04_2021", "05_2021", "06_2021", "07_2021")

for (x in 1:length(data_name_list)) {
  path = paste("data/",data_name_list[x],".xlsx", sep = "")
  df= read_excel(path, sheet = "data")
  # from March 2020 to June 2020 raw data excels have double "ID_KEL"
  if (x < 5) {
    df$ID_KEL...2 <- NULL
    names(df)[names(df) == 'ID_KEL...1'] <- 'ID_KEL'
  } 
  # from July 2020 onward raw data excels have double "Meninggal"
  else if (x == 5) {
    df$Meninggal...21 <- NULL
    names(df)[names(df) == 'Meninggal...26'] <- 'Meninggal'
  }
  else if (x == 6) {
    df$Meninggal...23 <- NULL
    names(df)[names(df) == 'Meninggal...28'] <- 'Meninggal'
  }
  else if (x == 7) {
    df$Meninggal...24 <- NULL
    names(df)[names(df) == 'Meninggal...29'] <- 'Meninggal'
  }
  else if (x == 8 | x == 9 | x == 10) {
    df$Meninggal...25 <- NULL
    names(df)[names(df) == 'Meninggal...30'] <- 'Meninggal'
  }
  else if (x > 10) {
    df$Meninggal...26 <- NULL
    names(df)[names(df) == 'Meninggal...31'] <- 'Meninggal'
  }
  
  df <- df[,c("ID_KEL", "Nama_provinsi", "nama_kota", "nama_kecamatan", "nama_kelurahan", "POSITIF", "Meninggal")]
  df$'month' <- data_refer_list[x]
  df <- df[-c(1), ]
  df <- df[(df$Nama_provinsi=="DKI JAKARTA"),]
  df_name = paste("df_", data_refer_list[x], sep = "")
  assign(df_name, df)
}

Merge dataframes into a big dataframe

temp_bind_df <- rbind(df_03_2020,df_04_2020)
temp_bind_df <- rbind(temp_bind_df,df_05_2020)
temp_bind_df <- rbind(temp_bind_df,df_06_2020)
temp_bind_df <- rbind(temp_bind_df,df_07_2020)
temp_bind_df <- rbind(temp_bind_df,df_08_2020)
temp_bind_df <- rbind(temp_bind_df,df_09_2020)
temp_bind_df <- rbind(temp_bind_df,df_10_2020)
temp_bind_df <- rbind(temp_bind_df,df_11_2020)
temp_bind_df <- rbind(temp_bind_df,df_12_2020)
temp_bind_df <- rbind(temp_bind_df,df_01_2021)
temp_bind_df <- rbind(temp_bind_df,df_02_2021)
temp_bind_df <- rbind(temp_bind_df,df_03_2021)
temp_bind_df <- rbind(temp_bind_df,df_04_2021)
temp_bind_df <- rbind(temp_bind_df,df_05_2021)
temp_bind_df <- rbind(temp_bind_df,df_06_2021)
binded_df <- rbind(temp_bind_df,df_07_2021)

3. Geospatial data preperation

Import data, transform projection and create dataframe

DJ = st_read(dsn = "data", 
                  layer = "BATAS_DESA_DESEMBER_2019_DUKCAPIL_DKI_JAKARTA")
Reading layer `BATAS_DESA_DESEMBER_2019_DUKCAPIL_DKI_JAKARTA' from data source `C:\yiling-yu\IS415_Blog\_take_home_exercises\Take-home Exercise 1\data' 
  using driver `ESRI Shapefile'
Simple feature collection with 269 features and 161 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 106.3831 ymin: -6.370815 xmax: 106.9728 ymax: -5.184322
Geodetic CRS:  WGS 84
#EPSG for DGN95 / Indonesia TM-3 zone 54.1: 23845
DJ_sf <- st_transform(DJ, crs = 23845)
st_geometry(DJ_sf)
Geometry set for 269 features 
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -3691981 ymin: 663887.8 xmax: -3606237 ymax: 815440.9
Projected CRS: DGN95 / Indonesia TM-3 zone 54.1
First 5 geometries:

Exclude all the outer islands from the DKI Jakarta sf data frame

have a look at the map to identify outer islands

tmap_mode('view')
tm_shape(DJ_sf) +
  tm_polygons()

remove the rows that have “KAB_KOTA” == “KEPULAUAN SERIBU”

DJ_sf <- DJ_sf[!(DJ_sf$KAB_KOTA == "KEPULAUAN SERIBU"),]
DJ_sf <- DJ_sf %>%
  drop_na()

have a look at the map again to confirm islands are excluded

tmap_mode('plot')
tm_shape(DJ_sf) +
  tm_polygons()

Retain the first nine fields in the DKI Jakarta sf data frame.

  DJ_sf = DJ_sf[,c(1:9)]
  #DJ_sf$geometry <- NULL

4. GEOSPATIAL DATA INTEGRATION

make sure value can match before performing georelational join

nama_kelurahan_values <-  unique(binded_df[c("nama_kelurahan")])

DESA_KELUR_values <- unique(DJ_sf[c("DESA_KELUR")]) %>%
  st_set_geometry(NULL)

no_match <- list()
for (x in nama_kelurahan_values$nama_kelurahan) {
  if (x %in% DESA_KELUR_values$DESA_KELUR == FALSE){
    no_match <- append(no_match,x)
  }
}

no_match
[[1]]
[1] "PINANG RANTI"

[[2]]
[1] "BALE KAMBANG"

[[3]]
[1] "PAL MERIAM"

[[4]]
[1] "JATI PULO"

[[5]]
[1] "KALI BARU"

[[6]]
[1] "RAWA JATI"

[[7]]
[1] "KERENDANG"

[[8]]
[1] "KAMPUNG TENGAH"

[[9]]
[1] "KRAMAT JATI"

[[10]]
[1] "HALIM PERDANA KUSUMAH"

[[11]]
[1] "P. HARAPAN"

[[12]]
[1] "P. KELAPA"

[[13]]
[1] "P. PANGGANG"

[[14]]
[1] "P. PARI"

[[15]]
[1] "P. TIDUNG"

[[16]]
[1] "UNTUNG JAWA"

[[17]]
[1] "PULAU HARAPAN"

[[18]]
[1] "PULAU KELAPA"

[[19]]
[1] "PULAU PANGGANG"

[[20]]
[1] "PULAU PARI"

[[21]]
[1] "PULAU TIDUNG"

[[22]]
[1] "PULAU UNTUNG JAWA"

*found many sub-district names not matched

change not-matched sub-district names

for (x in no_match) {
  if (x == "KERENDANG" ){
    binded_df$nama_kelurahan[binded_df$nama_kelurahan == x] <- "KRENDANG"
  }
  else if (x == "KAMPUNG TENGAH" ){
    binded_df$nama_kelurahan[binded_df$nama_kelurahan == x] <- "TENGAH"
  }
  else if (x == "HALIM PERDANA KUSUMAH" ){
    binded_df$nama_kelurahan[binded_df$nama_kelurahan == x] <- "HALIM PERDANA KUSUMA"
  }
  else if (x == "P. HARAPAN" | x == "P. KELAPA" | x == "P. PANGGANG" | x == "P. PARI" | x == "P. TIDUNG" | x == "UNTUNG JAWA" | x == "PULAU HARAPAN" | x == "PULAU KELAPA" | x == "PULAU PANGGANG" | x == "PULAU PARI"| x == "PULAU TIDUNG" | x == "PULAU UNTUNG JAWA"){
    binded_df<-binded_df[!(binded_df$nama_kelurahan == x),]
  }
  else {
    binded_df$nama_kelurahan[binded_df$nama_kelurahan == x] <- str_replace_all(string=x, pattern=" ", repl="")
  }
}

check value match

nama_kelurahan_values <-  unique(binded_df[c("nama_kelurahan")])

DESA_KELUR_values <- unique(DJ_sf[c("DESA_KELUR")]) %>%
  st_set_geometry(NULL)

no_match <- list()
for (x in nama_kelurahan_values$nama_kelurahan) {
  if (x %in% DESA_KELUR_values$DESA_KELUR == FALSE){
    no_match <- append(no_match,x)
  }
}

no_match
list()

*all sub-district names matched

georelational join

DJ_covid <- left_join(DJ_sf, binded_df,
                      by = c("DESA_KELUR" = "nama_kelurahan"))

colSums(is.na(DJ_covid))
     OBJECT_ID      KODE_DESA           DESA           KODE 
             0              0              0              0 
      PROVINSI       KAB_KOTA      KECAMATAN     DESA_KELUR 
             0              0              0              0 
    JUMLAH_PEN         ID_KEL  Nama_provinsi      nama_kota 
             0              0              0              0 
nama_kecamatan        POSITIF      Meninggal          month 
             0              0              0              0 
      geometry 
             0 

Calculate the cumulative confirmed cases rate (i.e. cases per 10000 population) and the cumulative death rate by month.

#column names
#POSITIF (cumulative confirmed cases)
#Meninggal (cumulative death cases)
#JUMLAH_PEN (Total Population)

DJ_covid <- DJ_covid %>%
  mutate(`POSITIF%` = (`POSITIF`
/`JUMLAH_PEN`)*10000) %>%
  mutate(`Meninggal%` = (`Meninggal`
/`JUMLAH_PEN`)*10000)

Save data to rds format and push to github

cleaned_df <- write_rds(DJ_covid, "data/DJ_covid.rds")

5. Import rds data

DJ_covid <- read_rds("data/DJ_covid.rds")

head(DJ_covid, n=5) 

stopped here

#simple plot
test <- DJ_covid %>%
  filter(month=="03_2020")

#tmap_mode("view")
#qtm(test, 
#    fill = "POSITIF%")



tmap_mode("view")+
tm_shape(test)+
tm_fill("POSITIF%", breaks = c(0, 4, 13, 53, 119, 270, 395, 3808), 
legend.is.portrait = TRUE) +
tm_borders(alpha = 0.5)


#EDA? any histogram? ggplot?
#image for eval=false + title label

#hands-on and in-class exercise 3